{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Authors: Andreas Haupt, Alexander Quispe, Anzony Quispe, Vasilis Syrgkanis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_execution_state": "idle",
    "_uuid": "051d70d956493feee0c6d64651c6a088724dca2a",
    "papermill": {
     "duration": 0.010774,
     "end_time": "2021-02-15T11:01:41.782833",
     "exception": false,
     "start_time": "2021-02-15T11:01:41.772059",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# Penalized Linear Regressions: A Simulation Experiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import random\n",
    "import math\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "random.seed(42)\n",
    "import warnings\n",
    "warnings.simplefilter('ignore')\n",
    "from sklearn.linear_model import LassoCV\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.linear_model import RidgeCV, ElasticNetCV, LinearRegression, Lasso, Ridge\n",
    "from sklearn.metrics import r2_score\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "import pandas as pd\n",
    "from sklearn.base import BaseEstimator, clone"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Generating Process"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We define a simple data generating process that contains both linear and non-linear components and allows for both sparse and dense coefficients"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def gen_data(n, p, *, sparse=True):\n",
    "    if sparse:\n",
    "        beta = (1 / np.arange(1, p)) ** 2\n",
    "    else:\n",
    "        beta = ((np.random.normal(0, 1, p - 1)) * 0.2)\n",
    "    true_fn = lambda x: np.exp(4 * x[:, 0]) + (x[:, 1:] @ beta)\n",
    "    X = np.random.uniform(-.5, .5, size=(n, p))\n",
    "    gX = true_fn(X)\n",
    "    y = gX + np.random.normal(0, 1, size=n)\n",
    "    Xtest = np.random.uniform(-.5, .5, size=(n, p))\n",
    "    gXtest = true_fn(Xtest)\n",
    "    ytest = gXtest + np.random.normal(0, 1, size=n)\n",
    "    Xpop = np.random.uniform(-.5, .5, size=(100000, p)) # almost population limit\n",
    "    gXpop = true_fn(Xpop)\n",
    "    ypop = gXpop + np.random.normal(0, 1, size=100000) # almost population limit\n",
    "    return X, y, gX, Xtest, ytest, gXtest, Xpop, ypop, gXpop"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.010616,
     "end_time": "2021-02-15T11:01:41.804126",
     "exception": false,
     "start_time": "2021-02-15T11:01:41.793510",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Data Generating Process: Approximately Sparse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "n = 100\n",
    "p = 400\n",
    "X, y, gX, Xtest, ytest, gXtest, Xpop, ypop, gXpop = gen_data(n, p, sparse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure()\n",
    "plt.title(r\"$Y$ vs. $g(X)$\")\n",
    "plt.scatter(gX, y)\n",
    "plt.xlabel(r\"$g(X)$\")\n",
    "plt.ylabel(r\"$Y$\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"theoretical R^2:, {1 - np.var(ypop - gXpop) / np.var(ypop)}\")\n",
    "print(f\"theoretical R^2:, {np.var(gXpop) / np.var(ypop)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Pre-Processing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We create non-linear features of the first regressor. We standarize our features so that penalization does not favor different features asymmetrically."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "poly = lambda x: np.hstack([x[:, [0]], x[:, [0]]**2, x[:, [0]]**3, x[:, 1:]])\n",
    "scaler = StandardScaler()\n",
    "X = scaler.fit_transform(poly(X))\n",
    "Xtest = scaler.transform(poly(Xtest))\n",
    "Xpop = scaler.transform(poly(Xpop))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lasso, Ridge, ElasticNet"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We use sklearn's penalized estimators, which choose the penalty parameter via cross-validation (by default 5-fold cross-validation). These methods search over an adaptively chosen grid of hyperparameters. `ElasticNet` allows for a convex combination of `l1` and `l2` penalty and the ratio with `l1_ratio` corresponding to the proportion of the `l1` penalty."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regressions\n",
    "lcv = LassoCV().fit(X, y)\n",
    "ridge = RidgeCV().fit(X, y)\n",
    "enet = ElasticNetCV(l1_ratio = 0.5).fit(X, y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We calculate the R-squared on the small test set that we have"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "r2_lcv = r2_score(ytest, lcv.predict(Xtest))\n",
    "r2_ridge = r2_score(ytest, ridge.predict(Xtest))\n",
    "r2_enet = r2_score(ytest, enet.predict(Xtest))\n",
    "r2_lcv, r2_ridge, r2_enet"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We also calculate what the R-squared would be in the population limit (in our case for practical purposes when we have a very very large test sample)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "r2_lcv = r2_score(ypop, lcv.predict(Xpop))\n",
    "r2_ridge = r2_score(ypop, ridge.predict(Xpop))\n",
    "r2_enet = r2_score(ypop, enet.predict(Xpop))\n",
    "r2_lcv, r2_ridge, r2_enet"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## This is the wrong post lasso OLS with cross-validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class PostLassoOLS:\n",
    "\n",
    "    def fit(self, X, y):\n",
    "        lasso = LassoCV().fit(X, y)\n",
    "        self.feats_ = np.abs(lasso.coef_) > 1e-6\n",
    "        self.lr_ = LinearRegression().fit(X[:, self.feats_], y)\n",
    "        return self\n",
    "\n",
    "    def predict(self, X):\n",
    "        return self.lr_.predict(X[:, self.feats_])\n",
    "\n",
    "    @property\n",
    "    def coef_(self):\n",
    "        return self.lr_.coef_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plols = PostLassoOLS().fit(X, y)\n",
    "r2_score(ypop, plols.predict(Xpop))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Plug-in Hyperparameter Lasso and Post-Lasso OLS"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.01429,
     "end_time": "2021-02-15T11:01:45.388902",
     "exception": false,
     "start_time": "2021-02-15T11:01:45.374612",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "Here we compute the lasso and ols post lasso using plug-in choices for penalty levels, using package hdm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Rlasso functionality: it is searching the right set of regressors. This function was made for the case of ***p*** regressors and ***n*** observations where ***p >>>> n***. It assumes that the error is i.i.d. The errors may be non-Gaussian or heteroscedastic.\\\n",
    "The post lasso function makes OLS with the selected ***T*** regressors.\n",
    "To select those parameters, they use $\\lambda$ as variable to penalize\\\n",
    "**Funny thing: the function rlasso was named like that because it is the \"rigorous\" Lasso.**\\\n",
    "We find a Python code that tries to replicate the main function of hdm r-package. It was made by [Max Huppertz](https://maxhuppertz.github.io/code/). His library is this [repository](https://github.com/maxhuppertz/hdmpy). Download its repository and copy this folder to your site-packages folder. In my case it is located here ***C:\\Python\\Python38\\Lib\\site-packages*** . We need to install this package ***pip install multiprocess***."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We wrap the package so that it has the familiar sklearn API\n",
    "import hdmpy\n",
    "\n",
    "class RLasso(BaseEstimator):\n",
    "    \n",
    "    def __init__(self, *, post=True):\n",
    "        self.post = post\n",
    "    \n",
    "    def fit(self, X, y):\n",
    "        self.rlasso_ = hdmpy.rlasso(X, y, post=self.post)\n",
    "        return self\n",
    "\n",
    "    def predict(self, X):\n",
    "        return X @ np.array(self.rlasso_.est['beta']).flatten() + np.array(self.rlasso_.est['intercept'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rlasso = RLasso(post = False).fit(X, y)\n",
    "rlasso_post = RLasso(post = True).fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "r2_rlasso = r2_score(ytest, rlasso.predict(Xtest))\n",
    "r2_rlasso_post = r2_score(ytest, rlasso_post.predict(Xtest))\n",
    "r2_rlasso, r2_rlasso_post"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "r2_rlasso = r2_score(ypop, rlasso.predict(Xpop))\n",
    "r2_rlasso_post = r2_score(ypop, rlasso_post.predict(Xpop))\n",
    "r2_rlasso, r2_rlasso_post"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LAVA: Dense + Sparse Coefficients"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now let's try the LAVA estimator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We construct an sklearn API estimator that implements the LAVA method\n",
    "\n",
    "class Lava(BaseEstimator):\n",
    "    \n",
    "    def __init__(self, *, alpha1=1, alpha2=1, iterations=5):\n",
    "        self.alpha1 = alpha1 # l1 penalty\n",
    "        self.alpha2 = alpha2\n",
    "        self.iterations = iterations\n",
    "\n",
    "    def fit(self, X, y):\n",
    "        ridge = Ridge(self.alpha2).fit(X, y)\n",
    "        lasso = Lasso(self.alpha1).fit(X, y - ridge.predict(X))\n",
    "\n",
    "        for _ in range(self.iterations - 1):\n",
    "            ridge = ridge.fit(X, y - lasso.predict(X))\n",
    "            lasso = lasso.fit(X, y - ridge.predict(X))\n",
    "\n",
    "        self.lasso_ = lasso\n",
    "        self.ridge_ = ridge\n",
    "        return self\n",
    "\n",
    "    def predict(self, X):\n",
    "        return self.lasso_.predict(X) + self.ridge_.predict(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lava = GridSearchCV(Lava(), {'alpha1': np.logspace(-4, 4, 20), 'alpha2': np.logspace(-4, 4, 20)},\n",
    "                    scoring='r2', n_jobs=-1)\n",
    "lava.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lava.best_estimator_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "r2_lava = r2_score(ytest, lava.predict(Xtest))\n",
    "r2_lava"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "r2_lava = r2_score(ypop, lava.predict(Xpop))\n",
    "r2_lava"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Summarizing Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df= pd.DataFrame({'LassoCV': [r2_lcv],\n",
    "                  'RidgeCV': [r2_ridge],\n",
    "                  'ElasticNetCV': [r2_enet],\n",
    "                  'RLasso': [r2_rlasso],\n",
    "                  'RLassoOLS': [r2_rlasso_post],\n",
    "                  'Lava': [r2_lava]}).T\n",
    "df.columns = ['Population R-squared']\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure()\n",
    "plt.scatter(gXtest, gXtest, marker = '.', c = 'black' )\n",
    "plt.scatter(gXtest, rlasso.predict(Xtest), marker = 'D' , c = 'red' , label = 'RLasso' )\n",
    "plt.scatter(gXtest, rlasso_post.predict(Xtest) , marker = '^' , c = 'green' , label = 'RLassoOLS')\n",
    "plt.scatter(gXtest, lcv.predict(Xtest) , marker = 'o' , c = 'blue' , label = 'LassoCV')\n",
    "plt.legend(loc='lower right')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.018842,
     "end_time": "2021-02-15T11:02:51.941852",
     "exception": false,
     "start_time": "2021-02-15T11:02:51.923010",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Data Generating Process: Approximately Sparse + Small Dense Part"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "n = 100\n",
    "p = 400\n",
    "X, y, gX, Xtest, ytest, gXtest, Xpop, ypop, gXpop = gen_data(n, p, sparse=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"theoretical R^2:, {1 - np.var(ypop - gXpop) / np.var(ypop)}\")\n",
    "print(f\"theoretical R^2:, {np.var(gXpop) / np.var(ypop)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "poly = lambda x: np.hstack([x[:, [0]], x[:, [0]]**2, x[:, [0]]**3, x[:, 1:]])\n",
    "scaler = StandardScaler()\n",
    "X = scaler.fit_transform(poly(X))\n",
    "Xtest = scaler.transform(poly(Xtest))\n",
    "Xpop = scaler.transform(poly(Xpop))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regressions\n",
    "lcv = LassoCV().fit(X, y)\n",
    "ridge = RidgeCV().fit(X, y)\n",
    "enet = ElasticNetCV(l1_ratio = 0.5).fit(X, y)\n",
    "rlasso = RLasso(post = False).fit(X, y)\n",
    "rlasso_post = RLasso(post = True).fit(X, y)\n",
    "lava = GridSearchCV(Lava(), {'alpha1': np.logspace(-4, 4, 20), 'alpha2': np.logspace(-4, 4, 20)},\n",
    "                    scoring='r2', n_jobs=-1).fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "r2_lcv = r2_score(ypop, lcv.predict(Xpop))\n",
    "r2_ridge = r2_score(ypop, ridge.predict(Xpop))\n",
    "r2_enet = r2_score(ypop, enet.predict(Xpop))\n",
    "r2_rlasso = r2_score(ypop, rlasso.predict(Xpop))\n",
    "r2_rlasso_post = r2_score(ypop, rlasso_post.predict(Xpop))\n",
    "r2_lava = r2_score(ypop, lava.predict(Xpop))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df= pd.DataFrame({'LassoCV': [r2_lcv],\n",
    "                  'RidgeCV': [r2_ridge],\n",
    "                  'ElasticNetCV': [r2_enet],\n",
    "                  'RLasso': [r2_rlasso],\n",
    "                  'RLassoOLS': [r2_rlasso_post],\n",
    "                  'Lava': [r2_lava]}).T\n",
    "df.columns = ['Population R-squared']\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure()\n",
    "plt.scatter(gXtest, gXtest, marker = '.', c = 'black' )\n",
    "plt.scatter(gXtest, rlasso.predict(Xtest), marker = 'D' , c = 'red' , label = 'RLasso' )\n",
    "plt.scatter(gXtest, rlasso_post.predict(Xtest) , marker = '^' , c = 'green' , label = 'RLassoOLS')\n",
    "plt.scatter(gXtest, lcv.predict(Xtest) , marker = 'o' , c = 'blue' , label = 'LassoCV')\n",
    "plt.scatter(gXtest, lava.predict(Xtest) , marker = 'o' , c = 'magenta' , label = 'Lava')\n",
    "plt.legend(loc='lower right')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
